library(reticulate)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(quanteda)
## Package version: 2.1.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(ggplot2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
## The following objects are masked from 'package:quanteda':
##
## meta, meta<-
##
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
##
## as.DocumentTermMatrix, stopwords
library(topicmodels)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x NLP::annotate() masks ggplot2::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(wordcloud)
## Loading required package: RColorBrewer
library(gutenbergr)
library(textclean)
library(foreach)
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
library(parallel)
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
##
## Attaching package: 'koRpus'
## The following object is masked from 'package:readr':
##
## tokenize
## The following objects are masked from 'package:quanteda':
##
## tokens, types
library(gmodels)
# Bind reticulate to a conda environment (no environment name is passed here).
use_condaenv()

# Source the Python helper script through reticulate, then the R helpers.
# NOTE(review): Extract_Named_Entities(), NE_Cleansing() and LDA_optimal() are
# not defined in this file -- presumably they come from these two scripts.
source_python("py_script.py")
source("Rfunctions.R")

# Read the article collection and use the 30000th article body as the
# worked example for the rest of the script.
news <- readRDS(
  file.path("data", "BBC_Reuters_GoogleNews_articles_business.rds")
)
article <- news$Body[30000]
article
## [1] "LOS ANGELES (Reuters) - The union representing U.S. screenwriters called for a strike against film and TV studios starting on Monday in a move giving negotiators one last weekend to reach a contract deal or shatter 20 years of Hollywood labor peace. \n\n The strike deadline was issued on Friday, a day after a three-year contract covering the 12,000-member Writers Guild of America expired. It follows months of talks that deadlocked over the union's demands for a greater share of DVD and Internet revenues. Each side has accused the other of stonewalling and refusing to budge from unreasonable proposals. Union negotiators urged a walkout during a boisterous membership meeting on Thursday night, and the Writers Guild's governing board voted to ratify that recommendation. Hours later, a studio spokesman said the two sides had scheduled a meeting for 10 a.m. Sunday. Union leaders said at an afternoon news conference there still was time to avoid a strike that, if prolonged, could cost hundreds of millions of dollars in lost revenues and wages. \"We have 48 hours and what we really want to do is negotiate,\" said John Bowman, chairman of the union's negotiating committee. He said that while reluctant to go on strike, the Writers Guild felt it had to act decisively. \"We have to inflict as much damage as quickly as possible in order to get this thing over,\" Bowman said. The Alliance of Motion Picture and Television Producers, the bargaining arm of the studios, offered a statement by the group's president, Nick Counter, calling the Writers Guild's move toward a strike \"precipitous and irresponsible.\" \"Our goal continues to be to reach a fair and reasonable agreement that will keep the industry working,\" he said. Union officials said that barring a last-minute deal, the strike would begin at 3:01 a.m. EST and picket lines would go up in Los Angeles and New York City. 
$1 BILLION AT RISK The last major Hollywood strike was a Writers Guild walkout in 1988 that lasted 22 weeks, delayed the start of the fall TV season and cost the industry an estimated $500 million. Los Angeles economist Jack Kyser said a strike of the same duration now could result in at least $1 billion in economic losses. The union says the overall compensation package sought by writers would cost $220 million over three years, a fraction of the $24.4 billion in revenues generated by U.S. DVD sales and rentals last year alone, according to accounting firm PricewaterhouseCoopers. A writers' strike would be little noticed by movie and TV audiences at first. Film studios' screenplay pipeline is well-stocked through 2008. And producers of prime-time sitcoms and dramas are said to have stockpiled enough advance episodes to keep their shows on the air until January or February. But late-night talk shows will go off the air almost immediately since they rely on a daily supply of topical jokes. On his CBS show on Thursday, David Letterman described the producers as \"cowards, cutthroats and weasels.\" Prime-time schedules will start filling up with more reruns and game shows after the networks have burned through fresh episodes. The new shows fighting to hold viewers' attention in the first few weeks of the new season face a grim future if they have to leave the schedule for an extended period. Negotiations on a new writers' contract began in July and the two sides have remained far apart. They brought in a federal mediator this week to try to break the deadlock on the key issue of compensating writers for the reuse of their work in various digital formats. The studios have said union demands for higher residuals on DVDs and Internet downloads would stifle growth at a time of rising production costs, tighter profits and piracy. They insist digital distribution of movies and TV remains largely experimental or promotional and new media is just developing. 
The union accuses studios of pleading poverty and argues that writers have never had a fair deal on lucrative DVDs. They also see more film and TV migrating toward the Internet and wireless platforms and want a bigger share of that revenue. (Additional writing by Dean Goodman)"
# Start over
# Split the article into segments and give each one a running id.
# Despite the column names, each row holds one tokenize_sentence() segment,
# not a paragraph. The printed result further down shows abbreviations such as
# "a.m." and "U.S." ending a segment, so some rows are sentence fragments.
art_parg <- article %>%
  tokenize_sentence() %>%
  unlist() %>%
  data.frame(paragraph_text = .) %>%
  rowid_to_column(var = "paragraph_num")

# Show every named entity found per segment, with its label.
Extract_Named_Entities(art_parg)
## paragraph_num Named.Entity
## 1 1 LOS ANGELES
## 2 1 Reuters
## 3 1 U.S.
## 4 1 Monday
## 5 1 one
## 6 1 last weekend
## 7 1 20 years
## 8 1 Hollywood
## 9 2 Friday
## 10 2 a day
## 11 2 three-year
## 12 2 12,000-member
## 13 2 Writers Guild of America
## 14 5 Thursday
## 15 5 night
## 16 5 the Writers Guild's
## 17 6 two
## 18 6 10 a.m.
## 19 7 Sunday
## 20 8 afternoon
## 21 8 hundreds of millions of dollars
## 22 9 48 hours
## 23 9 John Bowman
## 24 10 the Writers Guild
## 25 11 Bowman
## 26 12 The Alliance of Motion Picture and Television Producers
## 27 12 Nick Counter
## 28 12 the Writers Guild's
## 29 14 last-minute
## 30 14 3:01 a.m.
## 31 15 EST
## 32 15 Los Angeles
## 33 15 New York City
## 34 16 $1 BILLION
## 35 16 Hollywood
## 36 16 Writers Guild
## 37 16 1988
## 38 16 22 weeks
## 39 16 the start of the fall TV season
## 40 16 an estimated $500 million
## 41 17 Los Angeles
## 42 17 Jack Kyser
## 43 17 at least $1 billion
## 44 18 $220 million
## 45 18 three years
## 46 18 $24.4 billion
## 47 18 U.S.
## 48 19 last year
## 49 19 PricewaterhouseCoopers
## 50 20 first
## 51 21 2008
## 52 22 January
## 53 22 February
## 54 23 late-night
## 55 23 daily
## 56 24 CBS
## 57 24 Thursday
## 58 24 David Letterman
## 59 26 the first few weeks
## 60 26 the new season
## 61 27 July
## 62 27 two
## 63 28 this week
## 64 33 Dean Goodman
## Label NamedEntity
## 1 GPE LOSANGELES
## 2 ORG Reuters
## 3 GPE U.S.
## 4 DATE Monday
## 5 CARDINAL one
## 6 DATE lastweekend
## 7 DATE 20years
## 8 GPE Hollywood
## 9 DATE Friday
## 10 DATE aday
## 11 DATE three-year
## 12 DATE 12,000-member
## 13 ORG WritersGuildofAmerica
## 14 DATE Thursday
## 15 TIME night
## 16 ORG WritersGuild
## 17 CARDINAL two
## 18 TIME 10a.m.
## 19 DATE Sunday
## 20 TIME afternoon
## 21 MONEY hundredsofmillionsofdollars
## 22 TIME 48hours
## 23 PERSON JohnBowman
## 24 ORG WritersGuild
## 25 PERSON Bowman
## 26 ORG AllianceofMotionPictureandTelevisionProducers
## 27 PERSON NickCounter
## 28 ORG WritersGuild
## 29 TIME last-minute
## 30 TIME 3:01a.m.
## 31 ORG EST
## 32 GPE LosAngeles
## 33 GPE NewYorkCity
## 34 MONEY $1BILLION
## 35 GPE Hollywood
## 36 ORG WritersGuild
## 37 DATE 1988
## 38 DATE 22weeks
## 39 DATE startofthefallTVseason
## 40 MONEY anestimated$500million
## 41 GPE LosAngeles
## 42 PERSON JackKyser
## 43 MONEY atleast$1billion
## 44 MONEY $220million
## 45 DATE threeyears
## 46 MONEY $24.4billion
## 47 GPE U.S.
## 48 DATE lastyear
## 49 ORG PricewaterhouseCoopers
## 50 ORDINAL first
## 51 DATE 2008
## 52 DATE January
## 53 DATE February
## 54 TIME late-night
## 55 DATE daily
## 56 ORG CBS
## 57 DATE Thursday
## 58 PERSON DavidLetterman
## 59 DATE firstfewweeks
## 60 DATE newseason
## 61 DATE July
## 62 CARDINAL two
## 63 DATE thisweek
## 64 PERSON DeanGoodman
# Rewrite the segments with selected named entities handled by NE_Cleansing().
# Only place, organisation, person, location and nationality/group entities
# are passed on; the label column is dropped and duplicates are removed.
# Judging by the printed result, NE_Cleansing() adds a TEXT column in which
# each kept entity is collapsed into a single token -- the function itself is
# defined outside this file, so confirm against its source.
# Note: Extract_Named_Entities() is run again here rather than reusing the
# result displayed above.
art_rm_NE <- NE_Cleansing(
  art_parg,
  "paragraph_num",
  "paragraph_text",
  group = TRUE,
  rm = FALSE,
  Extract_Named_Entities(art_parg) %>%
    dplyr::filter(Label %in% c("GPE", "ORG", "PERSON", "LOC", "NORP")) %>%
    dplyr::select(-Label) %>%
    unique()
)
art_rm_NE
## paragraph_num
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9
## 10 10
## 11 11
## 12 12
## 13 13
## 14 14
## 15 15
## 16 16
## 17 17
## 18 18
## 19 19
## 20 20
## 21 21
## 22 22
## 23 23
## 24 24
## 25 25
## 26 26
## 27 27
## 28 28
## 29 29
## 30 30
## 31 31
## 32 32
## 33 33
## paragraph_text
## 1 LOS ANGELES (Reuters) - The union representing U.S. screenwriters called for a strike against film and TV studios starting on Monday in a move giving negotiators one last weekend to reach a contract deal or shatter 20 years of Hollywood labor peace.
## 2 The strike deadline was issued on Friday, a day after a three-year contract covering the 12,000-member Writers Guild of America expired.
## 3 It follows months of talks that deadlocked over the union's demands for a greater share of DVD and Internet revenues.
## 4 Each side has accused the other of stonewalling and refusing to budge from unreasonable proposals.
## 5 Union negotiators urged a walkout during a boisterous membership meeting on Thursday night, and the Writers Guild's governing board voted to ratify that recommendation.
## 6 Hours later, a studio spokesman said the two sides had scheduled a meeting for 10 a.m.
## 7 Sunday.
## 8 Union leaders said at an afternoon news conference there still was time to avoid a strike that, if prolonged, could cost hundreds of millions of dollars in lost revenues and wages.
## 9 "We have 48 hours and what we really want to do is negotiate," said John Bowman, chairman of the union's negotiating committee.
## 10 He said that while reluctant to go on strike, the Writers Guild felt it had to act decisively.
## 11 "We have to inflict as much damage as quickly as possible in order to get this thing over," Bowman said.
## 12 The Alliance of Motion Picture and Television Producers, the bargaining arm of the studios, offered a statement by the group's president, Nick Counter, calling the Writers Guild's move toward a strike "precipitous and irresponsible."
## 13 "Our goal continues to be to reach a fair and reasonable agreement that will keep the industry working," he said.
## 14 Union officials said that barring a last-minute deal, the strike would begin at 3:01 a.m.
## 15 EST and picket lines would go up in Los Angeles and New York City.
## 16 $1 BILLION AT RISK The last major Hollywood strike was a Writers Guild walkout in 1988 that lasted 22 weeks, delayed the start of the fall TV season and cost the industry an estimated $500 million.
## 17 Los Angeles economist Jack Kyser said a strike of the same duration now could result in at least $1 billion in economic losses.
## 18 The union says the overall compensation package sought by writers would cost $220 million over three years, a fraction of the $24.4 billion in revenues generated by U.S.
## 19 DVD sales and rentals last year alone, according to accounting firm PricewaterhouseCoopers.
## 20 A writers' strike would be little noticed by movie and TV audiences at first.
## 21 Film studios' screenplay pipeline is well-stocked through 2008.
## 22 And producers of prime-time sitcoms and dramas are said to have stockpiled enough advance episodes to keep their shows on the air until January or February.
## 23 But late-night talk shows will go off the air almost immediately since they rely on a daily supply of topical jokes.
## 24 On his CBS show on Thursday, David Letterman described the producers as "cowards, cutthroats and weasels."
## 25 Prime-time schedules will start filling up with more reruns and game shows after the networks have burned through fresh episodes.
## 26 The new shows fighting to hold viewers' attention in the first few weeks of the new season face a grim future if they have to leave the schedule for an extended period.
## 27 Negotiations on a new writers' contract began in July and the two sides have remained far apart.
## 28 They brought in a federal mediator this week to try to break the deadlock on the key issue of compensating writers for the reuse of their work in various digital formats.
## 29 The studios have said union demands for higher residuals on DVDs and Internet downloads would stifle growth at a time of rising production costs, tighter profits and piracy.
## 30 They insist digital distribution of movies and TV remains largely experimental or promotional and new media is just developing.
## 31 The union accuses studios of pleading poverty and argues that writers have never had a fair deal on lucrative DVDs.
## 32 They also see more film and TV migrating toward the Internet and wireless platforms and want a bigger share of that revenue.
## 33 (Additional writing by Dean Goodman)
## TEXT
## 1 LOSANGELES(Reuters) - The union representing U.S. screenwriters called for a strike against film and TV studios starting on Monday in a move giving negotiators one last weekend to reach a contract deal or shatter 20 years of Hollywood labor peace.
## 2 The strike deadline was issued on Friday, a day after a three-year contract covering the 12,000-member WritersGuildofAmerica expired.
## 3 It follows months of talks that deadlocked over the union's demands for a greater share of DVD and Internet revenues.
## 4 Each side has accused the other of stonewalling and refusing to budge from unreasonable proposals.
## 5 Union negotiators urged a walkout during a boisterous membership meeting on Thursday night, and WritersGuild governing board voted to ratify that recommendation.
## 6 Hours later, a studio spokesman said the two sides had scheduled a meeting for 10 a.m.
## 7 Sunday.
## 8 Union leaders said at an afternoon news conference there still was time to avoid a strike that, if prolonged, could cost hundreds of millions of dollars in lost revenues and wages.
## 9 "We have 48 hours and what we really want to do is negotiate," said JohnBowman, chairman of the union's negotiating committee.
## 10 He said that while reluctant to go on strike, WritersGuild felt it had to act decisively.
## 11 "We have to inflict as much damage as quickly as possible in order to get this thing over," Bowman said.
## 12 AllianceofMotionPictureandTelevisionProducers, the bargaining arm of the studios, offered a statement by the group's president, NickCounter, calling WritersGuild move toward a strike "precipitous and irresponsible."
## 13 "Our goal continues to be to reach a fair and reasonable agreement that will keep the industry working," he said.
## 14 Union officials said that barring a last-minute deal, the strike would begin at 3:01 a.m.
## 15 EST and picket lines would go up in LosAngeles and NewYorkCity.
## 16 $1 BILLION AT RISK The last major Hollywood strike was a WritersGuild walkout in 1988 that lasted 22 weeks, delayed the start of the fall TV season and cost the industry an estimated $500 million.
## 17 LosAngeles economist JackKyser said a strike of the same duration now could result in at least $1 billion in economic losses.
## 18 The union says the overall compensation package sought by writers would cost $220 million over three years, a fraction of the $24.4 billion in revenues generated by U.S.
## 19 DVD sales and rentals last year alone, according to accounting firm PricewaterhouseCoopers.
## 20 A writers' strike would be little noticed by movie and TV audiences at first.
## 21 Film studios' screenplay pipeline is well-stocked through 2008.
## 22 And producers of prime-time sitcoms and dramas are said to have stockpiled enough advance episodes to keep their shows on the air until January or February.
## 23 But late-night talk shows will go off the air almost immediately since they rely on a daily supply of topical jokes.
## 24 On his CBS show on Thursday, DavidLetterman described the producers as "cowards, cutthroats and weasels."
## 25 Prime-time schedules will start filling up with more reruns and game shows after the networks have burned through fresh episodes.
## 26 The new shows fighting to hold viewers' attention in the first few weeks of the new season face a grim future if they have to leave the schedule for an extended period.
## 27 Negotiations on a new writers' contract began in July and the two sides have remained far apart.
## 28 They brought in a federal mediator this week to try to break the deadlock on the key issue of compensating writers for the reuse of their work in various digital formats.
## 29 The studios have said union demands for higher residuals on DVDs and Internet downloads would stifle growth at a time of rising production costs, tighter profits and piracy.
## 30 They insist digital distribution of movies and TV remains largely experimental or promotional and new media is just developing.
## 31 The union accuses studios of pleading poverty and argues that writers have never had a fair deal on lucrative DVDs.
## 32 They also see more film and TV migrating toward the Internet and wireless platforms and want a bigger share of that revenue.
## 33 (Additional writing by DeanGoodman)
# Tokenise the entity-cleansed text and count each lemma per segment.
#
# Result: one row per (id, word) with n = number of occurrences, where id is
# the segment number (paragraph_num).
#
# Changes from the previous version:
# * The "^the" prefix strip is gone. It also hit ordinary tokens ("theft"
#   became "ft" and was then dropped by the length filter), and the entity
#   tokens printed above already arrive without a leading article.
# * The "^The" strip is gone: unnest_tokens() lowercases tokens by default,
#   so it could never match.
# * The second digit filter ('^\\d[a-z][a-z]') is gone: every token it would
#   catch already matches '^\\d'.
df <- art_rm_NE %>%
  select(paragraph_num, TEXT) %>%
  unnest_tokens(input = TEXT, output = word) %>%
  mutate(
    # Drop a trailing possessive before lemmatising ("union's" -> "union").
    word = str_remove_all(word, "'s$"),
    word = textstem::lemmatize_words(word),
    word = tolower(word)
  ) %>%
  # Discard tokens that start with a digit (amounts, years, times).
  filter(!str_detect(word, "^\\d")) %>%
  # Natural join on the shared "word" column; emits the message shown below.
  anti_join(stop_words) %>%
  filter(nchar(word) > 2) %>%
  dplyr::rename(id = paragraph_num) %>%
  dplyr::count(id, word)
## Joining, by = "word"
df
## id word n
## 1 1 call 1
## 2 1 contract 1
## 3 1 deal 1
## 4 1 film 1
## 5 1 hollywood 1
## 6 1 labor 1
## 7 1 losangeles 1
## 8 1 monday 1
## 9 1 move 1
## 10 1 negotiator 1
## 11 1 peace 1
## 12 1 reach 1
## 13 1 represent 1
## 14 1 reuters 1
## 15 1 screenwriter 1
## 16 1 shatter 1
## 17 1 start 1
## 18 1 strike 1
## 19 1 studio 1
## 20 1 u.s 1
## 21 1 union 1
## 22 1 weekend 1
## 23 2 contract 1
## 24 2 cover 1
## 25 2 day 1
## 26 2 deadline 1
## 27 2 expire 1
## 28 2 friday 1
## 29 2 issue 1
## 30 2 strike 1
## 31 2 writersguildofamerica 1
## 32 3 deadlock 1
## 33 3 demand 1
## 34 3 dvd 1
## 35 3 follow 1
## 36 3 internet 1
## 37 3 month 1
## 38 3 revenue 1
## 39 3 share 1
## 40 3 talk 1
## 41 3 union 1
## 42 4 accuse 1
## 43 4 budge 1
## 44 4 proposal 1
## 45 4 refuse 1
## 46 4 stonewall 1
## 47 4 unreasonable 1
## 48 5 board 1
## 49 5 boisterous 1
## 50 5 govern 1
## 51 5 meet 1
## 52 5 membership 1
## 53 5 negotiator 1
## 54 5 night 1
## 55 5 ratify 1
## 56 5 recommendation 1
## 57 5 thursday 1
## 58 5 union 1
## 59 5 urge 1
## 60 5 vote 1
## 61 5 walkout 1
## 62 5 writersguild 1
## 63 6 a.m 1
## 64 6 hour 1
## 65 6 late 1
## 66 6 meet 1
## 67 6 schedule 1
## 68 6 spokesman 1
## 69 6 studio 1
## 70 7 sunday 1
## 71 8 afternoon 1
## 72 8 avoid 1
## 73 8 conference 1
## 74 8 cost 1
## 75 8 dollar 1
## 76 8 hundred 1
## 77 8 leader 1
## 78 8 lose 1
## 79 8 million 1
## 80 8 news 1
## 81 8 prolong 1
## 82 8 revenue 1
## 83 8 strike 1
## 84 8 time 1
## 85 8 union 1
## 86 8 wage 1
## 87 9 chairman 1
## 88 9 committee 1
## 89 9 hour 1
## 90 9 johnbowman 1
## 91 9 negotiate 2
## 92 9 union 1
## 93 10 act 1
## 94 10 decisively 1
## 95 10 feel 1
## 96 10 reluctant 1
## 97 10 strike 1
## 98 10 writersguild 1
## 99 11 bowman 1
## 100 11 damage 1
## 101 11 inflict 1
## 102 11 quickly 1
## 103 12 allianceofmotionpictureandtelevisionproducers 1
## 104 12 arm 1
## 105 12 bargain 1
## 106 12 call 1
## 107 12 irresponsible 1
## 108 12 move 1
## 109 12 nickcounter 1
## 110 12 offer 1
## 111 12 precipitous 1
## 112 12 president 1
## 113 12 statement 1
## 114 12 strike 1
## 115 12 studio 1
## 116 12 writersguild 1
## 117 13 agreement 1
## 118 13 continue 1
## 119 13 fair 1
## 120 13 goal 1
## 121 13 industry 1
## 122 13 reach 1
## 123 13 reasonable 1
## 124 14 a.m 1
## 125 14 bar 1
## 126 14 begin 1
## 127 14 deal 1
## 128 14 minute 1
## 129 14 official 1
## 130 14 strike 1
## 131 14 union 1
## 132 15 est 1
## 133 15 line 1
## 134 15 losangeles 1
## 135 15 newyorkcity 1
## 136 15 picket 1
## 137 16 billion 1
## 138 16 cost 1
## 139 16 delay 1
## 140 16 estimate 1
## 141 16 fall 1
## 142 16 hollywood 1
## 143 16 industry 1
## 144 16 major 1
## 145 16 million 1
## 146 16 risk 1
## 147 16 season 1
## 148 16 start 1
## 149 16 strike 1
## 150 16 walkout 1
## 151 16 week 1
## 152 16 writersguild 1
## 153 17 billion 1
## 154 17 duration 1
## 155 17 economic 1
## 156 17 economist 1
## 157 17 jackkyser 1
## 158 17 losangeles 1
## 159 17 loss 1
## 160 17 result 1
## 161 17 strike 1
## 162 18 billion 1
## 163 18 compensation 1
## 164 18 cost 1
## 165 18 fraction 1
## 166 18 generate 1
## 167 18 million 1
## 168 18 package 1
## 169 18 revenue 1
## 170 18 seek 1
## 171 18 u.s 1
## 172 18 union 1
## 173 18 writer 1
## 174 19 accord 1
## 175 19 account 1
## 176 19 dvd 1
## 177 19 firm 1
## 178 19 pricewaterhousecoopers 1
## 179 19 rental 1
## 180 19 sale 1
## 181 20 audience 1
## 182 20 movie 1
## 183 20 notice 1
## 184 20 strike 1
## 185 20 writer 1
## 186 21 film 1
## 187 21 pipeline 1
## 188 21 screenplay 1
## 189 21 stock 1
## 190 21 studio 1
## 191 22 advance 1
## 192 22 air 1
## 193 22 drama 1
## 194 22 episode 1
## 195 22 february 1
## 196 22 january 1
## 197 22 prime 1
## 198 22 producer 1
## 199 22 sitcom 1
## 200 22 stockpile 1
## 201 22 time 1
## 202 23 air 1
## 203 23 daily 1
## 204 23 immediately 1
## 205 23 joke 1
## 206 23 late 1
## 207 23 night 1
## 208 23 rely 1
## 209 23 supply 1
## 210 23 talk 1
## 211 23 topical 1
## 212 24 cbs 1
## 213 24 coward 1
## 214 24 cutthroats 1
## 215 24 davidletterman 1
## 216 24 describe 1
## 217 24 producer 1
## 218 24 thursday 1
## 219 24 weasel 1
## 220 25 burn 1
## 221 25 episode 1
## 222 25 fill 1
## 223 25 fresh 1
## 224 25 game 1
## 225 25 network 1
## 226 25 prime 1
## 227 25 rerun 1
## 228 25 schedule 1
## 229 25 start 1
## 230 25 time 1
## 231 26 attention 1
## 232 26 extend 1
## 233 26 fight 1
## 234 26 future 1
## 235 26 grim 1
## 236 26 hold 1
## 237 26 leave 1
## 238 26 period 1
## 239 26 schedule 1
## 240 26 season 1
## 241 26 viewer 1
## 242 26 week 1
## 243 27 begin 1
## 244 27 contract 1
## 245 27 july 1
## 246 27 negotiation 1
## 247 27 remain 1
## 248 27 writer 1
## 249 28 break 1
## 250 28 bring 1
## 251 28 compensate 1
## 252 28 deadlock 1
## 253 28 digital 1
## 254 28 federal 1
## 255 28 format 1
## 256 28 issue 1
## 257 28 key 1
## 258 28 mediator 1
## 259 28 reuse 1
## 260 28 week 1
## 261 28 writer 1
## 262 29 cost 1
## 263 29 demand 1
## 264 29 download 1
## 265 29 dvds 1
## 266 29 growth 1
## 267 29 internet 1
## 268 29 piracy 1
## 269 29 production 1
## 270 29 profit 1
## 271 29 residual 1
## 272 29 rise 1
## 273 29 stifle 1
## 274 29 studio 1
## 275 29 tight 1
## 276 29 time 1
## 277 29 union 1
## 278 30 develop 1
## 279 30 digital 1
## 280 30 distribution 1
## 281 30 experimental 1
## 282 30 insist 1
## 283 30 medium 1
## 284 30 movie 1
## 285 30 promotional 1
## 286 30 remain 1
## 287 31 accuse 1
## 288 31 argue 1
## 289 31 deal 1
## 290 31 dvds 1
## 291 31 fair 1
## 292 31 lucrative 1
## 293 31 plead 1
## 294 31 poverty 1
## 295 31 studio 1
## 296 31 union 1
## 297 31 writer 1
## 298 32 film 1
## 299 32 internet 1
## 300 32 migrate 1
## 301 32 platform 1
## 302 32 revenue 1
## 303 32 share 1
## 304 32 wireless 1
## 305 33 additional 1
## 306 33 deangoodman 1
## 307 33 write 1
# Re-clean the counted tokens and rebuild the per-segment term frequencies
# that feed the document-term matrix.
#
# Fix: the final count() now sums the existing `n` column (wt = n). Without a
# weight, count() on a frame that already has `n` counts rows under dplyr 1.x,
# so every frequency collapsed to the number of merged rows and the counts
# computed in `df` were lost (e.g. "negotiate" with n = 2 became 1).
#
# Also removed: the "^the" prefix strip (it mangles ordinary words such as
# "theft" -> "ft"), the "^The" strip (tokens are already lower case), and a
# set_names() call that re-applied the names `df` already has.
df_lemma <- df %>%
  mutate(
    word = str_remove_all(word, "'s$"),
    word = textstem::lemmatize_words(word),
    word = tolower(word)
  ) %>%
  # Natural join on the shared "word" column.
  anti_join(stop_words) %>%
  filter(nchar(word) > 2) %>%
  # Rows that now share the same (id, word) are merged and their counts added.
  dplyr::count(id, word, wt = n)
## Joining, by = "word"
# Cast the tidy counts into a document-term matrix:
# one document per segment id, one term per word, cell value = n.
dtm <- tidytext::cast_dtm(df_lemma, document = id, term = word, value = n)
dtm
## <<DocumentTermMatrix (documents: 33, terms: 218)>>
## Non-/sparse entries: 307/6887
## Sparsity : 96%
## Maximal term length: 45
## Weighting : term frequency (tf)
# Fit the topic models through the project helper and show its diagnostic plot.
# NOTE(review): LDA_optimal() is defined outside this file; the meaning of the
# positional arguments 2, 10, 5 is not visible here -- confirm against its
# definition. No seed is set in this script, so results may vary between runs
# unless the helper fixes one itself.
mod <- LDA_optimal(dtm, 2, 10, 5)
mod[["Plot"]]
# Build the word-cloud input: the top 20 terms of every topic, each with its
# total frequency across all segments.
#
# Result columns: topic, word, n (one row per topic/word pair).
#
# Fix: the reshape now gathers every topic column. The old range
# `Topic 1`:`Topic 4` errored with fewer than four topics and left extra
# topics behind as stray columns with more than four.
#
# NOTE(review): this uses mod$min_perp$model, while the table printed just
# below uses mod$stationary_prep$model -- confirm which model is intended.
# NOTE(review): the result reuses the name `terms`, shadowing the terms()
# generic for value lookups. The name is kept because the word-cloud call
# further down reads `terms$word`, `terms$n` and `terms$topic`.
terms <- terms(mod$min_perp$model, k = 20) %>%
  as.data.frame() %>%
  gather(topic, word, dplyr::everything(), factor_key = FALSE) %>%
  # Natural join on "word": attaches every per-segment count of the term.
  left_join(df) %>%
  # Drop top terms that have no match in df.
  filter(!is.na(n)) %>%
  select(-id) %>%
  group_by(topic, word) %>%
  mutate(n = sum(n)) %>%
  ungroup() %>%
  # After summing, the per-segment rows are identical; keep one of each.
  distinct()
## Joining, by = "word"
# Show the top 20 terms per topic for the model stored under
# mod$stationary_prep, one column per topic.
as.data.frame(terms(mod$stationary_prep$model, k = 20))
## Topic 1 Topic 2 Topic 3 Topic 4
## 1 negotiator strike union studio
## 2 reach time writer writersguild
## 3 talk deal revenue film
## 4 night start cost call
## 5 a.m week million internet
## 6 late issue hollywood hour
## 7 schedule season demand monday
## 8 air movie accuse screenwriter
## 9 labor episode thursday dvd
## 10 reuters prime industry share
## 11 u.s producer u.s budge
## 12 writersguildofamerica move weekend proposal
## 13 dvd peace deadlock unreasonable
## 14 board represent follow meet
## 15 govern shatter internet johnbowman
## 16 meet cover month irresponsible
## 17 membership day boisterous statement
## 18 recommendation deadline ratify fair
## 19 vote friday urge est
## 20 spokesman deadlock walkout newyorkcity
## Topic 5
## 1 contract
## 2 losangeles
## 3 billion
## 4 begin
## 5 remain
## 6 digital
## 7 move
## 8 expire
## 9 walkout
## 10 sunday
## 11 prolong
## 12 chairman
## 13 committee
## 14 decisively
## 15 bowman
## 16 damage
## 17 quickly
## 18 allianceofmotionpictureandtelevisionproducers
## 19 arm
## 20 bargain
# Draw a word cloud of the top topic terms, sized by total frequency and
# coloured by topic (ordered.colors = TRUE pairs each colour with its word).
# NOTE(review): the palette has 8 colours and is indexed by the topic factor
# code, so a model with more than 8 topics would yield NA colours.
with(
  terms,
  wordcloud(
    words = word,
    freq = n,
    min.freq = 1,
    max.words = 40,
    random.order = FALSE,
    rot.per = 0.1,
    ordered.colors = TRUE,
    colors = brewer.pal(8, "Dark2")[factor(topic)]
  )
)
library(LDAvis)